suppressPackageStartupMessages({
library(tidyverse)
library(scran)
library(scater)
library(patchwork)
library(scDblFinder)
library(cowplot)
library(pheatmap)
library(edgeR)
library(ggrepel)
})
data.dir <- "Files_already_generated_with_Sevenbridges/D2/"
data <- read.csv(paste(data.dir, "Combined_CARNK_Sample2_RSEC_MolsPerCell.csv", sep = ""), skip = 7)
data$Cell_Index <- paste(data$Cell_Index, "_D2", sep = "")
The metadata contains Sample_Tag (nucleotide labeled antibody used to distinguish between samples), Sample_Name (patient id), multiplet status assigned by the SevenBridges pipeline
# donor 2
## table with cell information (sample barcode, donor, ...)
coldata <- read.csv(paste(data.dir, "CARNK_Sample2_Sample_Tag_Calls.csv", sep = ""), skip = 7) %>%
dplyr::mutate(invalid = (Sample_Name == "Multiplet" | Sample_Name == "Undetermined")) %>%
dplyr::mutate(donor = "D2")
coldata$Cell_Index <- paste(coldata$Cell_Index, "_D2", sep = "")
SevenBridges pre-processing pipeline report:
sb <- data.frame(Donor = "D2",
Number.of.Cells = dim(data)[1],
Number.of.Features = dim(data[, -1])[2],
Median.Number.of.Features = median(rowSums(data > 0)),
stringsAsFactors = F)
DT::datatable(sb,
class = 'compact stripe hower',
options = list(
dom = "Bfrtip",
scrollX = TRUE,
paging = FALSE,
searching = FALSE,
info = FALSE,
ordering = FALSE,
columnDefs = list(list(className = 'dt-center', targets = 0:3))), rownames = FALSE)
Number of cells in each sample:
table(coldata$Sample_Name)
##
## CARKOafter CARKObefore CARafter CARbefore KOafter KObefore
## 2740 2081 2866 2434 2719 2349
## Multiplet NTafter NTbefore Undetermined
## 2766 2880 2359 184
We define a new column containing the group of samples:
coldata <- coldata %>%
dplyr::mutate(group = ifelse(grepl("before", Sample_Name), "BeforeCoc",
ifelse(grepl("after", Sample_Name), "AfterCoc", NA)))
p1 <- coldata %>%
ggplot(aes(x = donor, fill = group)) +
geom_bar() +
labs(x = "Donor", fill = "Group")
p2 <- coldata %>%
ggplot(aes(x = donor, fill = Sample_Name)) +
geom_bar() +
labs(x = "Donor", fill = "Sample Name")
p3 <- coldata %>%
ggplot(aes(x = group, fill = Sample_Name)) +
geom_bar() +
labs(x = "Group", fill = "Sample Name")
p1 + p2 + p3
coldata %>%
ggplot(aes(x = donor, fill = group, color = Sample_Name)) +
geom_bar() +
labs(x = "Donor", fill = "Group", color = "Sample Name")
data_t <- data[, -1] %>% t()
rownames(data_t) <- colnames(data[, -1])
colnames(data_t) <- data$Cell_Index
sce <- SingleCellExperiment(assays = list(counts = data_t), colData = coldata)
rowData(sce)$Type <- ifelse(grepl("abo", rownames(sce), ignore.case = TRUE), "Antibody Capture", "Gene Expression") # 'Type' is either gene or antibody
# removing the cells with multiplet or undetermined assignment
sce <- sce[, !coldata$invalid]
coldata <- coldata %>%
dplyr::filter(!invalid)
# splitting protein from RNA
sce_split <- splitAltExps(sce, rowData(sce)$Type)
# altExpNames(sce_split)
# ADT vs RNA counts
rna_numi <- colSums(counts(sce_split))
adt_numi <- colSums(counts(altExp(sce_split)))
nUMIdf <- data.frame(rna_nUMI = rna_numi,
adt_nUMI = adt_numi)
ggplot(nUMIdf, aes(x = log10(adt_nUMI+1), y = log10(rna_nUMI+1))) +
geom_point(alpha = 0.1, size = 1) +
geom_density_2d(color = "orange") +
labs(x = "ADT Counts", y = "RNA Counts")
outliers <- perCellQCMetrics(sce_split)
libsize.drop <- isOutlier(outliers$sum, log = TRUE, type = "both", nmads = 3)
feature.drop <- isOutlier(outliers$detected, log = TRUE, type = "lower", nmads = 3)
print(data.frame(ByLibSize = sum(libsize.drop),
ByFeature = sum(feature.drop),
Discard = sum(libsize.drop | feature.drop)))
## ByLibSize ByFeature Discard
## 1 129 102 231
outliers.discard <- libsize.drop | feature.drop
#outliers.discard <- quickPerCellQC(outliers)
#colSums(as.matrix(outliers.discard))
ab.discard <- isOutlier(outliers$`altexps_Antibody Capture_detected`,
log = TRUE,
type = "lower",
min_diff = 1)
summary(ab.discard)
## Mode FALSE
## logical 20428
hist(outliers$`altexps_Antibody Capture_detected`,
col = 'grey',
main = "", xlab = "Number of detected ADTs")
colData(sce_split) <- cbind(colData(sce_split), outliers) # adding the outliers info (sum, detected) to the sce object coldata
sce_split$discard <- outliers.discard | ab.discard # adding the cells to be discarded to the sce object coldata
plotColData(sce_split, x = "detected", y = "sum", colour_by = "discard") +
theme(panel.border = element_rect(color = "grey")) +
labs(x = "Number of Features Detected", y = "Library Size")
gridExtra::grid.arrange(
plotColData(sce_split, y="sum", colour_by="discard") +
scale_y_log10() + ggtitle("Library Size"),
plotColData(sce_split, y="detected", colour_by="discard") +
scale_y_log10() + ggtitle("Genes Detected") + labs(y = "genes detected"),
plotColData(sce_split, y="altexps_Antibody Capture_detected", colour_by="discard") +
scale_y_log10() + ggtitle("Antibodies Detected") + labs(y = "antibodies detected"),
ncol=1
)
# ADT vs RNA counts
rna_numi <- colSums(counts(sce_split))
adt_numi <- colSums(counts(altExp(sce_split)))
nUMIdf <- data.frame(rna_nUMI = rna_numi,
adt_nUMI = adt_numi,
Discard = sce_split$discard)
ggplot(nUMIdf, aes(x = log10(adt_nUMI+1), y = log10(rna_nUMI+1), color = Discard)) +
geom_point(alpha = 0.1, size = 1) +
geom_density_2d(color = "black") +
labs(x = "ADT Counts", y = "RNA Counts") +
scale_color_manual(values = c("#0072B2", "#C4961A"))
# filtering; removing outliers
sce_split <- sce_split[, !sce_split$discard]
sce_split
## class: SingleCellExperiment
## dim: 431 20197
## metadata(0):
## assays(1): counts
## rownames(431): ADA ADGRE1 ... ZBTB16 ZNF683
## rowData names(2): Type scDblFinder.selected
## colnames(20197): 697359_D2 864267_D2 ... 314351_D2 855920_D2
## colData names(17): Cell_Index Sample_Tag ... total discard
## reducedDimNames(0):
## mainExpName: Gene Expression
## altExpNames(1): Antibody Capture
table(coldata$Sample_Name)
##
## CARKOafter CARKObefore CARafter CARbefore KOafter KObefore
## 2740 2081 2866 2434 2719 2349
## NTafter NTbefore
## 2880 2359
# RNA
## size factors
lib.sf <- librarySizeFactors(sce_split)
summary(lib.sf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2150 0.5897 0.8379 1.0000 1.2181 4.2694
lib.sf.df <- data.frame(lib.sf, Group = colData(sce_split)$group)
## plotting size factors
hist(log10(lib.sf), xlab="Log10[Size Factor]", col='grey80')
ggplot(lib.sf.df, aes(log10(lib.sf))) +
geom_histogram() +
facet_wrap(~Group, scales = "free") +
labs(x = "Log10[Size Factor]")
## normalization
sce_split <- logNormCounts(sce_split)
#assayNames(sce_split)
# ADT
clr_norm <- function(x) {
return(log1p(x = x / (exp(x = sum(log1p(x = x[x > 0]), na.rm = TRUE) / length(x = x)))))
}
logcounts(altExp(sce_split)) <- apply(counts(altExp(sce_split)), 2, clr_norm)
# determining variance components
dec <- modelGeneVar(sce_split)
dec[order(dec$bio, decreasing = TRUE), ]
## DataFrame with 431 rows and 6 columns
## mean total tech bio p.value FDR
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## CCL4 4.51591 3.75062 0.850912 2.89971 2.79404e-33 1.03100e-30
## IL32 4.20734 3.26054 0.773740 2.48680 8.02914e-30 1.48138e-27
## CCL3 3.24819 2.92965 1.030847 1.89880 5.03180e-11 4.64184e-09
## FCGR3A 2.84808 2.97694 1.126733 1.85020 4.09905e-09 3.02510e-07
## CCL5 3.89221 2.44580 0.785547 1.66025 5.89577e-14 7.25180e-12
## ... ... ... ... ... ... ...
## PTPRC 2.20790 0.787051 1.192988 -0.405937 0.883852 0.944507
## TRBC2 2.59636 0.724354 1.159376 -0.435022 0.906107 0.952574
## LAMP1 2.08797 0.756541 1.203842 -0.447301 0.903936 0.952574
## HLA.A 4.83744 0.338797 0.953669 -0.614872 0.988191 0.990509
## HLA.C 5.64365 0.362781 1.005275 -0.642494 0.987569 0.990509
fit <- metadata(dec)
plot(fit$mean, fit$var,
xlab = "Mean of log-expression", ylab = "Variance of log-expression")
points(fit$mean, fit$var, col = "red", pch = 16)
curve(fit$trend(x), col = "dodgerblue", add = TRUE, lwd = 2)
Looking at the distribution of ADT markers (after normalization):
p <- do.call(plot_grid, c(lapply(rownames(logcounts(altExp(sce_split))), function(adt_name){
ggplot(as.data.frame(logcounts(altExp(sce_split))[adt_name, ]), aes_string(x=logcounts(altExp(sce_split))[adt_name, ])) +
geom_histogram(color="black",
fill = "black",
breaks=seq(0, 4.5, by=0.10)) +
xlab(adt_name) +
theme(axis.title.x = element_text(size = 8, face="bold"))}), ncol = 4))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## i Please use tidy evaluation ideoms with `aes()`
print(p)
We define a new column containing the condition of samples:
# creating new colData
sce_split$CAR <- ifelse(sce_split$Sample_Name %in% c("CARKOafter", "CARKObefore", "CARafter", "CARbefore"), "Yes", "No")
sce_split$KO <- ifelse(sce_split$Sample_Name %in% c("CARKOafter", "CARKObefore", "KOafter", "KObefore"), "Yes", "No")
sce_split$condition <- ifelse(sce_split$Sample_Name %in% c("CARKOafter", "CARKObefore"), "CARKO",
ifelse(sce_split$Sample_Name %in% c("CARafter", "CARbefore"), "CAR",
ifelse(sce_split$Sample_Name %in% c("KOafter", "KObefore"), "KO", "NT")))
sce_split$condition <- factor(sce_split$condition, levels = c("NT", "KO", "CAR", "CARKO"))
saveRDS(sce_split, "intermediate_files/D2/sce_donor2.Rds")
date()
## [1] "Thu Jan 26 11:21:13 2023"
sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=C LC_NUMERIC=C
## [3] LC_TIME=de_DE.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggrepel_0.9.2 edgeR_3.38.4
## [3] limma_3.52.4 pheatmap_1.0.12
## [5] cowplot_1.1.1 scDblFinder_1.10.0
## [7] patchwork_1.1.2 scater_1.24.0
## [9] scran_1.24.1 scuttle_1.6.3
## [11] SingleCellExperiment_1.18.1 SummarizedExperiment_1.26.1
## [13] Biobase_2.56.0 GenomicRanges_1.48.0
## [15] GenomeInfoDb_1.32.4 IRanges_2.30.1
## [17] S4Vectors_0.34.0 BiocGenerics_0.42.0
## [19] MatrixGenerics_1.8.1 matrixStats_0.63.0
## [21] forcats_0.5.2 stringr_1.5.0
## [23] dplyr_1.0.10 purrr_1.0.1
## [25] readr_2.1.3 tidyr_1.3.0
## [27] tibble_3.1.8 ggplot2_3.4.0
## [29] tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.1 backports_1.4.1
## [3] igraph_1.3.5 BiocParallel_1.30.4
## [5] crosstalk_1.2.0 digest_0.6.31
## [7] htmltools_0.5.4 viridis_0.6.2
## [9] fansi_1.0.4 magrittr_2.0.3
## [11] ScaledMatrix_1.4.1 googlesheets4_1.0.1
## [13] cluster_2.1.4 tzdb_0.3.0
## [15] Biostrings_2.64.1 modelr_0.1.10
## [17] timechange_0.2.0 colorspace_2.1-0
## [19] rvest_1.0.3 haven_2.5.1
## [21] xfun_0.36 crayon_1.5.2
## [23] RCurl_1.98-1.9 jsonlite_1.8.4
## [25] glue_1.6.2 gtable_0.3.1
## [27] gargle_1.2.1 zlibbioc_1.42.0
## [29] XVector_0.36.0 DelayedArray_0.22.0
## [31] BiocSingular_1.12.0 scales_1.2.1
## [33] DBI_1.1.3 Rcpp_1.0.10
## [35] isoband_0.2.7 viridisLite_0.4.1
## [37] dqrng_0.3.0 rsvd_1.0.5
## [39] DT_0.27 metapod_1.4.0
## [41] htmlwidgets_1.6.1 httr_1.4.4
## [43] RColorBrewer_1.1-3 ellipsis_0.3.2
## [45] pkgconfig_2.0.3 XML_3.99-0.13
## [47] farver_2.1.1 sass_0.4.5
## [49] dbplyr_2.3.0 locfit_1.5-9.7
## [51] utf8_1.2.2 tidyselect_1.2.0
## [53] labeling_0.4.2 rlang_1.0.6
## [55] munsell_0.5.0 cellranger_1.1.0
## [57] tools_4.2.2 cachem_1.0.6
## [59] xgboost_1.7.3.1 cli_3.6.0
## [61] generics_0.1.3 broom_1.0.2
## [63] evaluate_0.20 fastmap_1.1.0
## [65] yaml_2.3.7 knitr_1.41
## [67] fs_1.6.0 sparseMatrixStats_1.8.0
## [69] xml2_1.3.3 compiler_4.2.2
## [71] rstudioapi_0.14 beeswarm_0.4.0
## [73] reprex_2.0.2 statmod_1.5.0
## [75] bslib_0.4.2 stringi_1.7.12
## [77] highr_0.10 lattice_0.20-45
## [79] bluster_1.6.0 Matrix_1.5-3
## [81] vctrs_0.5.2 pillar_1.8.1
## [83] lifecycle_1.0.3 jquerylib_0.1.4
## [85] BiocNeighbors_1.14.0 data.table_1.14.6
## [87] bitops_1.0-7 irlba_2.3.5.1
## [89] rtracklayer_1.56.1 R6_2.5.1
## [91] BiocIO_1.6.0 gridExtra_2.3
## [93] vipor_0.4.5 codetools_0.2-18
## [95] MASS_7.3-58.1 assertthat_0.2.1
## [97] rjson_0.2.21 withr_2.5.0
## [99] GenomicAlignments_1.32.1 Rsamtools_2.12.0
## [101] GenomeInfoDbData_1.2.8 parallel_4.2.2
## [103] hms_1.1.2 grid_4.2.2
## [105] beachmat_2.12.0 rmarkdown_2.20
## [107] DelayedMatrixStats_1.18.2 googledrive_2.0.0
## [109] lubridate_1.9.1 ggbeeswarm_0.7.1
## [111] restfulr_0.0.15
knitr::knit_exit()